In [2]:
%matplotlib inline
import numpy as np
import pandas as pd
In [2]:
%load ../ud120-projects/final_project/poi_id.py
In [1]:
#%%writefile ../ud120-projects/final_project/poi_id.py
#!/usr/bin/python
import matplotlib.pyplot as plt
import sys
import pickle
sys.path.append("../ud120-projects/tools/")
from feature_format import featureFormat
from feature_format import targetFeatureSplit
### features_list is a list of strings, each of which is a feature name
### first feature must be "poi", as this will be singled out as the label
features_list = ['poi', 'salary', 'deferral_payments', 'total_payments', 'loan_advances',
                 'bonus', 'restricted_stock_deferred', 'deferred_income', 'total_stock_value',
                 'expenses', 'exercised_stock_options', 'other', 'long_term_incentive',
                 'restricted_stock', 'director_fees', 'to_messages',
                 'from_poi_to_this_person', 'from_messages', 'from_this_person_to_poi',
                 'shared_receipt_with_poi']  # 'poi' appears only once, as the label
### load the dictionary containing the dataset
data_dict = pickle.load(open("../ud120-projects/final_project/final_project_dataset.pkl", "r") )
### we suggest removing any outliers before proceeding further
### if you are creating any new features, you might want to do that here
### store to my_dataset for easy export below
my_dataset = data_dict
### these two lines extract the features specified in features_list
### and extract them from data_dict, returning a numpy array
data = featureFormat(my_dataset, features_list)
### if you are creating new features, could also do that here
### split into labels and features (this line assumes that the first
### feature in the array is the label, which is why "poi" must always
### be first in features_list
labels, features = targetFeatureSplit(data)
### machine learning goes here!
### please name your classifier clf for easy export below
clf = None ### get rid of this line! just here to keep code from crashing out-of-box
### dump your classifier, dataset and features_list so
### anyone can run/check your results
pickle.dump(clf, open("../ud120-projects/final_project/my_classifier.pkl", "w") )
pickle.dump(data_dict, open("../ud120-projects/final_project/my_dataset.pkl", "w") )
pickle.dump(features_list, open("../ud120-projects/final_project/my_feature_list.pkl", "w") )
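The skeleton above runs end-to-end once clf = None is replaced by any scikit-learn classifier. The sketch below uses a GaussianNB purely as a hypothetical placeholder; the actual model selection happens in the rest of this notebook.
In [ ]:
# hypothetical placeholder so the skeleton can be run and dumped out of the box;
# the real classifier is chosen further down in this notebook
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(features, labels)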
In [ ]:
#%load ../ud120-projects/final_project/tester.py
In [ ]:
#!/usr/bin/python
""" a basic script for importing student's POI identifier,
and checking the results that they get from it
requires that the algorithm, dataset, and features list
be written to my_classifier.pkl, my_dataset.pkl, and
my_feature_list.pkl, respectively
that process should happen at the end of poi_id.py
"""
import pickle
import sys
sys.path.append("../tools/")
from feature_format import featureFormat, targetFeatureSplit
### load up student's classifier, dataset, and feature_list
clf = pickle.load(open("my_classifier.pkl", "r") )
dataset = pickle.load(open("my_dataset.pkl", "r") )
feature_list = pickle.load(open("my_feature_list.pkl", "r"))
### print basic info about the algorithm/parameters used
print clf
### prepare data for training/testing
data = featureFormat(dataset, feature_list)
labels, features = targetFeatureSplit(data)
### stratified k-fold cross-validation is a form of
### CV where instances of each class are proportionally apportioned--
### e.g. if you have 10% of one class and 90% of the other,
### stratification means each fold will have 10% of one
### class and 90% of the other
###
### this is helpful when you don't have a lot of instances
### of one class or the other, because in that case the
### low-frequency class can become lopsided in the train/test
### split and skew the results
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.cross_validation import StratifiedKFold
skf = StratifiedKFold( labels, n_folds=3 )
precisions = []
recalls = []
for train_idx, test_idx in skf:
    features_train = []
    features_test  = []
    labels_train   = []
    labels_test    = []
    for ii in train_idx:
        features_train.append( features[ii] )
        labels_train.append( labels[ii] )
    for jj in test_idx:
        features_test.append( features[jj] )
        labels_test.append( labels[jj] )
    ### fit the classifier using training set, and test on test set
    clf.fit(features_train, labels_train)
    pred = clf.predict(features_test)
    ### for each fold, print some metrics
    print
    print "precision score: ", precision_score( labels_test, pred )
    print "recall score: ", recall_score( labels_test, pred )
    precisions.append( precision_score(labels_test, pred) )
    recalls.append( recall_score(labels_test, pred) )
### aggregate precision and recall over all folds
print "average precision: ", sum(precisions)/3.
print "average recall: ", sum(recalls)/3.
#print precision_score( labels_test, pred )
#print recall_score( labels_test, pred )
In [4]:
data_dict = pickle.load(open("../ud120-projects/final_project/my_dataset.pkl", "r") )
In [56]:
#[v for k,v in data_dict.items()][0]
In [5]:
data_dict.items()[0]
Out[5]:
In [6]:
df = pd.DataFrame.from_dict(my_dataset, orient='index')
In [7]:
#%load ../ud120-projects/tools/feature_format.py
In [8]:
df.head()
Out[8]:
In [9]:
df['salary'].unique()
Out[9]:
'NaN' was imported as a string rather than as a missing value. We will convert these strings to actual NaN values and look at how many missing values the data has.
In [10]:
df = df.replace('NaN', np.nan)
In [11]:
df.info()
There is a lot of missing data!
In [12]:
print "NaN - Missing values:"
len(df.index)-df.count()
Out[12]:
First, check for entries that may not be actual people by looking for index names that don't contain a space.
In [13]:
[suspect for suspect in df.index if " " not in suspect]
Out[13]:
TOTAL is an aggregate category, and not a person's name. This should be removed.
In [14]:
df = df.drop('TOTAL', axis=0)
Next, we'll look at people who have three or fewer non-null entries (one of which is simply the True/False poi label, not a real feature) out of 21 columns. One happens to be a travel agency, and the others are missing nearly all entries as well.
These are good candidates for potential removal.
In [15]:
print [ind for ind in enumerate(df.T.count()) if ind[1] <= 3]
df.irow([56, 84, 127, 137, 142])
Out[15]:
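Equivalently, without relying on hard-coded row positions, the same sparse rows can be pulled out directly from the count condition, as a small sketch:
In [ ]:
# rows with three or fewer non-null entries (the boolean poi label counts as one)
df[df.count(axis=1) <= 3]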
In [16]:
#df.columns
#df = df.drop(['Name'], axis=1)
df = df.drop(['GRAMM WENDY L', 'THE TRAVEL AGENCY IN THE PARK', 'LOCKHART EUGENE E', 'WHALEY DAVID A', 'WROBEL BRUCE'], axis=0)
Email address is also not needed for this model as it is a unique string for each person.
In [17]:
df = df.drop(['email_address'], axis=1)
First, we must deal with the NaNs, since many models can't handle missing values. A quick and dirty solution is to simply fill the missing values with 0s. This is just to get a model up and running; the missing values will be handled differently later.
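For reference, that quick zero-fill is a one-liner on the DataFrame (shown only as a sketch; df_zero_filled is a hypothetical name, and the notebook below ends up using per-column medians instead):
In [ ]:
# quick-and-dirty fill: replace every missing value with 0
df_zero_filled = df.fillna(0)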
In [18]:
from sklearn.cross_validation import train_test_split
In [98]:
labels = df['poi']
features = df.drop('poi', axis=1)
features_train, features_test, labels_train, labels_test = train_test_split(features, labels,
test_size=0.2,
random_state=808)
In [20]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.grid_search import GridSearchCV
In [21]:
#param_grid = [{'C':[.0001, .001, .01, 0.1, 1, 10, 100, 1000], 'gamma': [10, 1, .1, .01, .001, .0001]}]
In [116]:
from sklearn.cross_validation import StratifiedShuffleSplit
In [122]:
sss = StratifiedShuffleSplit(df.poi, n_iter=10, test_size=0.2)  # an iterable of 10 stratified (train_idx, test_idx) splits, not a single pair
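StratifiedShuffleSplit yields one (train index, test index) pair per iteration rather than a single split, so it would be consumed like this (a sketch under the old sklearn.cross_validation API used throughout this notebook, with features and labels as defined a few cells above):
In [ ]:
# each iteration is one stratified 80/20 split of the row positions
for train_idx, test_idx in sss:
    features_train, features_test = features.iloc[train_idx], features.iloc[test_idx]
    labels_train, labels_test = labels.iloc[train_idx], labels.iloc[test_idx]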
In [23]:
#svm_model = SVC()
#clf = grid_search.GridSearchCV(svm_model, param_grid, n_jobs=4, scoring='f1')
In [25]:
#NaN, can't fit yet
#clf.fit(features_train, labels_train)
In [27]:
#rf = RandomForestClassifier(n_estimators=1000, n_jobs=4)
In [29]:
#NaN, can't fit yet
#rf.fit(features_train, labels_train)
In [34]:
#pred = rf.predict(features_test)
#print "Accuracy:", accuracy_score(labels_test, pred), '\n'
#print "Confusion Matrix:\n", confusion_matrix(labels_test, pred), '\n'
#print "Classification Report:", classification_report(labels_test, pred)
In [32]:
#features = np.array(features)
#labels = np.array(labels)
#features_test = np.array(features_test)
#features_train = np.array(features_train)
#labels_test = np.array(labels_test)
#labels_train = np.array(labels_train)
In [94]:
df.head()
Out[94]:
In [39]:
#df[(df.poi == True)].email_address
In [34]:
df.info()
In [46]:
df.describe()
Out[46]:
In [47]:
import matplotlib.pyplot as plt
In [35]:
plt.plot(df.salary.fillna(df.salary.median()))  # fill missing salaries with the salary median
plt.plot(df[df.poi==True].salary)
Out[35]:
In [78]:
df.apply(lambda x: x.fillna(x.median()), axis=0).describe()
Out[78]:
In [74]:
df.info()
In [101]:
plt.plot(df.long_term_incentive, 'ro')
plt.plot(df[df.poi==True].long_term_incentive, 'bo')
Out[101]:
In [36]:
df1 = df.drop(['deferral_payments', 'restricted_stock_deferred', 'loan_advances', 'director_fees'], axis=1)
In [37]:
f1 = df1.drop(['poi'], axis=1)
y1 = df['poi']
In [38]:
from sklearn.preprocessing import scale
We need to fill in the missing values.
In [40]:
f1 = f1.apply(lambda x: x.fillna(x.median()), axis=0)
f_scaled = scale(f1)
In [41]:
from sklearn.decomposition import PCA
In [42]:
# Keep the number of components that capture 90% of the variance.
pca = PCA(n_components=0.90, whiten=True).fit(f_scaled)
In [43]:
# There are 9 principal components that capture at least 90% of the variance.
# This is a reduction from the 15 scaled input features.
pca.n_components_
Out[43]:
In [228]:
pca.explained_variance_ratio_
Out[228]:
In [44]:
# Exact total variance captured by 9 principal components is 92.2%
sum(pca.explained_variance_ratio_)
Out[44]:
In [45]:
# Use the pca model to transform our training and testing set features.
x_pca = pca.transform(f_scaled)
In [56]:
from sklearn.grid_search import GridSearchCV
from sklearn.svm import SVC
param_grid = {
'C': [1e-6, 1e-5, 1e-4, 1e-3, 1e-2, .1, 1, 10, 100],
'gamma': [1e-300, 1e-200, 1e-100, 1e-30, 1e-20, 1e-12, 1e-11, 1e-10, 1e-9, 1e-8, 1e-7],
}
clf = GridSearchCV(SVC(kernel='rbf', class_weight='auto'), param_grid,n_jobs=4)
clf = clf.fit(x_pca, y1)
In [57]:
print clf.best_estimator_
In [59]:
# SVC/SVM
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
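# Note: the predictions below are made on the same data the grid search was fit on
# (no held-out set), so these scores are optimistic; the same caveat applies to the
# random forest cell that follows.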
y_pred = clf.predict(x_pca)
print classification_report(y1, y_pred)
print confusion_matrix(y1, y_pred)
print accuracy_score(y1, y_pred)
In [90]:
#Random Forest
rf = RandomForestClassifier(n_estimators=2000, n_jobs=4, oob_score=True)
rf.fit(x_pca, y1)
y_pred = rf.predict(x_pca)
print classification_report(y1, y_pred)
print confusion_matrix(y1, y_pred)
print accuracy_score(y1, y_pred)
In [72]:
from operator import itemgetter
# Utility function to report best scores
def report(grid_scores, n_top=3):
    top_scores = sorted(grid_scores, key=itemgetter(1), reverse=True)[:n_top]
    for i, score in enumerate(top_scores):
        print("Model with rank: {0}".format(i + 1))
        print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
              score.mean_validation_score,
              np.std(score.cv_validation_scores)))
        print("Parameters: {0}".format(score.parameters))
        print("")
In [83]:
# !!
# Use oob_score=True and bootstrap=True for OOB estimates for random forests.
# !!
clf = RandomForestClassifier(n_estimators=2000, oob_score=True)
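Once such a forest is fitted, the out-of-bag estimate is exposed as an attribute. A minimal sketch using the PCA-transformed features from above (rf_oob is a hypothetical name, separate from the grid-searched clf below):
In [ ]:
# OOB samples give a built-in generalization estimate without a separate test set
rf_oob = RandomForestClassifier(n_estimators=200, oob_score=True, bootstrap=True)
rf_oob.fit(x_pca, y1)
print "OOB accuracy estimate: ", rf_oob.oob_score_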
In [84]:
# use a full grid over all parameters
param_grid = {"max_depth": [3, None],
"max_features": [1, 3, 9],
"min_samples_split": [1, 3, 10],
"min_samples_leaf": [1, 3, 10],
"criterion": ["gini", "entropy"]}
In [85]:
# run the grid search
# n_jobs=4 for parallel processing across 4 cores
grid_search = GridSearchCV(clf, param_grid=param_grid, pre_dispatch=8, n_jobs=4)
In [86]:
grid_search.fit(x_pca, y1)
Out[86]:
In [87]:
report(grid_search.grid_scores_)
In [88]:
clf = grid_search.best_estimator_
clf
Out[88]:
In [91]:
features = x_pca.copy()
labels = y1.copy()
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
skf = StratifiedKFold( labels, n_folds=3 )
precisions = []
recalls = []
for train_idx, test_idx in skf:
    features_train = []
    features_test  = []
    labels_train   = []
    labels_test    = []
    for ii in train_idx:
        features_train.append( features[ii] )
        labels_train.append( labels[ii] )
    for jj in test_idx:
        features_test.append( features[jj] )
        labels_test.append( labels[jj] )
    ### fit the classifier using training set, and test on test set
    clf.fit(features_train, labels_train)
    pred = clf.predict(features_test)
    ### for each fold, print some metrics
    print
    print "precision score: ", precision_score( labels_test, pred )
    print "recall score: ", recall_score( labels_test, pred )
    print "confusion matrix\n", confusion_matrix(labels_test, pred)
    precisions.append( precision_score(labels_test, pred) )
    recalls.append( recall_score(labels_test, pred) )
### aggregate precision and recall over all folds
print "average precision: ", sum(precisions)/3.
print "average recall: ", sum(recalls)/3.